Setup notebook

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(phyloseq)
library(microbiome)
## 
## microbiome R package (microbiome.github.com)
##     
## 
## 
##  Copyright (C) 2011-2022 Leo Lahti, 
##     Sudarshan Shetty et al. <microbiome.github.io>
## 
## 
## Attaching package: 'microbiome'
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## 
## The following object is masked from 'package:base':
## 
##     transform

Functions

#input = dataframe of features x labels (including sample names)
# List of foods in order of model importance

plot_food_direction = function(food_ranking, data, variable) {
 
  for(food in food_ranking) {
  
    cols = colnames(data)
  
    if(food %in% cols){
     plot = data %>%
    select(variable, food) %>%
    ggplot()+
    geom_boxplot(aes_string(x= variable, y= food, colour = variable))+
      geom_jitter(aes_string(x= variable, y= food,  colour = variable), height=0)+
    theme_classic() 
  
    print (plot) 
   }
  } 
}

Load data

aha_pred = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/Machine_learning/LAD_lab_ML/code/aha_to_choice_predictions.csv")
## Rows: 3420 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): sample, pred, id, treatment, true_week, timepoint
## dbl (4): Case, Control, auc, iteration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
aha_importance = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/Machine_learning/LAD_lab_ML/code/aha_to_choice_feature_importance.csv")
## Rows: 20 Columns: 191
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): label
## dbl (190): ATCCTATTATTTTATTATTTTACGAAACTAAACAAAGGTTCAGCAAGCGAGAATAATAAAAAAAG...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pomms_pred = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/Machine_learning/LAD_lab_ML/code/pomms_to_choice_predictions.csv")
## Rows: 3420 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): sample, pred, id, treatment, true_week, timepoint
## dbl (4): Case, Control, auc, iteration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pomms_importance = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/Machine_learning/LAD_lab_ML/code/pomms_to_choice_feature_importance.csv")
## Rows: 20 Columns: 226
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): label
## dbl (225): ATCACGTTTTCCGAAAACAAACAAAGGTTCAGAAAGCGAAAATAAAAAAG, ATCCTTATTTTGA...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

AUC range

aha_pred %>%
  select(auc) %>%
  distinct() %>%
  ggplot()+
  geom_boxplot(aes(y= auc, x= "AHA"))+
  geom_point(aes(y= auc, x= "AHA"), alpha = 0.5)+
  labs(title = "AHA AUC range")+
  theme_classic()

pomms_pred %>%
  select(auc) %>%
  distinct() %>%
  ggplot()+
  geom_boxplot(aes(y= auc, x= "POMMS"))+
  geom_point(aes(y= auc, x= "POMMS"), alpha = 0.5)+
  labs(title = "POMMS AUC range")+
  theme_classic()

Best AUC

max = aha_pred$auc %>%
  max()

aha_pred %>%
  filter(auc == max) %>%
  ggplot()+
  geom_boxplot(aes(x= treatment, y= Case, colour = timepoint))+
  geom_point(aes(x= treatment, y= Case, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
  labs(title ="AHA to CHOICE")+
  theme_classic()

aha_pred %>%
  filter(auc == 0.94) %>%
  ggplot()+
  geom_boxplot(aes(x= treatment, y= Control, colour = timepoint))+
  geom_point(aes(x= treatment, y= Control, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
  labs(title ="AHA to CHOICE")+
  theme_classic()

max = pomms_pred$auc %>%
  max()

pomms_pred %>%
  filter(auc == max) %>%
  ggplot()+
  geom_boxplot(aes(x= treatment, y= Case, colour = timepoint))+
  geom_point(aes(x= treatment, y= Case, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
  labs(title ="POMMS to CHOICE")+
  theme_classic()

pomms_pred %>%
  filter(auc == 0.82) %>%
  ggplot()+
  geom_boxplot(aes(x= treatment, y= Control, colour = timepoint))+
  geom_point(aes(x= treatment, y= Control, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
  labs(title ="POMMS to CHOICE")+
  theme_classic()

#AUCs above 0.7

aha_pred %>%
  filter(auc > 0.7) %>%
  ggplot()+
  geom_boxplot(aes(x= treatment, y= Case, colour = timepoint))+
  geom_point(aes(x= treatment, y= Case, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
  labs(title ="AHA to CHOICE")+
  theme_classic()

aha_pred %>%
  filter(auc > 0.7) %>%
  ggplot()+
  geom_boxplot(aes(x= treatment, y= Control, colour = timepoint))+
  geom_point(aes(x= treatment, y= Control, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
  labs(title ="AHA to CHOICE")+
  theme_classic()

pomms_pred %>%
  filter(auc > 0.7) %>%
  ggplot()+
  geom_boxplot(aes(x= treatment, y= Case, colour = timepoint))+
  geom_point(aes(x= treatment, y= Case, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
  labs(title ="POMMS to CHOICE")+
  theme_classic()

pomms_pred %>%
  filter(auc > 0.7) %>%
  ggplot()+
  geom_boxplot(aes(x= treatment, y= Control, colour = timepoint))+
  geom_point(aes(x= treatment, y= Control, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
  labs(title ="POMMS to CHOICE")+
  theme_classic()

ASV lookup table

# combined = readRDS("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/CHOICE/All_cohorts/data_objects/CHOICE_POMMS_AHA_combined_20221213.rds")
# 
# lookup = combined %>%
#   tax_table() %>%
#   as.data.frame() %>%
#   as_tibble(rownames = NA) %>%
#   rownames_to_column("asv")
common_names = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Brianna P/diet/food-dbs/data/outputs/dada2-compatible/trnL/trnLGH ASV common names.csv")
## Rows: 405 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): asv, name, taxon, common_name
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

CHOICE

c_ps = readRDS("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/CHOICE/CHOICE_Trnl/CHOICE_20220912/ps_objects/choice_complete_20221205_clean.rds")

otu_clr = abundances(c_ps, "clr") %>% # clr transform
  t()

c_ps = phyloseq(otu_table(otu_clr, taxa_are_rows = FALSE),
                   sample_data(sample_data(c_ps)),
                   tax_table(tax_table(c_ps)))
samdf = c_ps %>%
  sample_data() %>%
  as.data.frame() %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column("sample") %>%
  select(sample, id, treatment, true_week, timepoint)

otudf = c_ps %>%
  otu_table() %>%
  as.data.frame() %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column("sample")

input = samdf %>%
  left_join(otudf)
## Joining, by = "sample"

Best AUC importances

max = aha_pred$auc %>%
  max()

#colnames(aha_importance)

importance_summary = aha_importance %>%
  filter(auc == max) %>%
  pivot_longer(cols = c("ATCCTATTATTTTATTATTTTACGAAACTAAACAAAGGTTCAGCAAGCGAGAATAATAAAAAAAG":"CATAAACTGGTAGCGACCGGCACCACCGGTAAACTGATCGCAGAAGCTACCGGCT"), names_to = "food", values_to = "importance") %>%
  group_by(label, food) %>%
  summarise(mean_importance = mean(importance)) %>%
  arrange(label, desc(mean_importance))
## `summarise()` has grouped output by 'label'. You can override using the
## `.groups` argument.
ranking = importance_summary %>%
  pull(food)

top10 = ranking[1:10]

plot_food_direction(top10, input, "treatment")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(variable)
## 
##   # Now:
##   data %>% select(all_of(variable))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(food)
## 
##   # Now:
##   data %>% select(all_of(food))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation ideoms with `aes()`

name_list = c()
for(item in top10){
 name = common_names %>%
    filter(str_detect(asv, item)) %>%
    pull(common_name)
 name_list = append(name_list, name)
}
name_list
##  [1] "cocoa bean"                                                                                                                   
##  [2] "avocado, NA, NA, NA, NA, NA"                                                                                                  
##  [3] "NA, ceylon cinnamon, bay leaf, avocado, NA, NA"                                                                               
##  [4] "goji berry, NA, goji berry, cutleaf groundcherry, tomatillo, nightshade, NA, blackberry nightshade, potato, NA"               
##  [5] "garlic"                                                                                                                       
##  [6] "muscadine grape, common grape vine, NA, NA"                                                                                   
##  [7] "common grape vine"                                                                                                            
##  [8] "common grape vine"                                                                                                            
##  [9] "black chokeberry, apple, NA, crab apple, european pear, NA, afghan pear, nashi pear, siberian pear, hybrid chinese white pear"
## [10] "NA, pecan"                                                                                                                    
## [11] "sesame"
max = pomms_pred$auc %>%
  max()

#colnames(pomms_importance)

importance_summary = pomms_importance %>%
  filter(auc == max) %>%
  pivot_longer(cols = c("ATCACGTTTTCCGAAAACAAACAAAGGTTCAGAAAGCGAAAATAAAAAAG":"ATCCTGTTTTCTCAAAACAAACAAAGGTTCAGAAAAAAAG"), names_to = "food", values_to = "importance") %>%
  group_by(label, food) %>%
  summarise(mean_importance = mean(importance)) %>%
  arrange(label, desc(mean_importance))
## `summarise()` has grouped output by 'label'. You can override using the
## `.groups` argument.
ranking = importance_summary %>%
  pull(food)

top10 = ranking[1:10]

plot_food_direction(top10, input, "treatment")

name_list = c()
for(item in top10){
 name = common_names %>%
    filter(str_detect(asv, item)) %>%
    pull(common_name)
 name_list = append(name_list, name)
}
name_list
##  [1] "dandelion"                                                                                                                                                                                                                                             
##  [2] "sunflower, jerusalem artichoke, sunchoke, dandelion"                                                                                                                                                                                                   
##  [3] "ashanti pepper, long pepper, black pepper, wild betel"                                                                                                                                                                                                 
##  [4] "canola, rapeseed"                                                                                                                                                                                                                                      
##  [5] "brown mustard, canola, rapeseed, black mustard, cabbage, broccoli, cauliflower, kale, Brussels sprouts, collard greens, savoy, kohlrabi, gai lan, mustards, bok choy, canola, rapeseed, field mustard, napa cabbage, turnip, rocket, NA, white mustard"
##  [6] "oregano, NA, thyme"                                                                                                                                                                                                                                    
##  [7] "sage"                                                                                                                                                                                                                                                  
##  [8] "sage"                                                                                                                                                                                                                                                  
##  [9] "NA, NA, rosemary"                                                                                                                                                                                                                                      
## [10] "lemon balm, rosemary"                                                                                                                                                                                                                                  
## [11] "sesame"                                                                                                                                                                                                                                                
## [12] NA                                                                                                                                                                                                                                                      
## [13] NA                                                                                                                                                                                                                                                      
## [14] "NA, NA"                                                                                                                                                                                                                                                
## [15] NA                                                                                                                                                                                                                                                      
## [16] "rye, NA, NA, NA, NA, NA, bread wheat, NA, NA, NA, emmer wheat, spelt, einkorn, wild einkorn, spelt, NA, domesticated hulled wheat, NA, rivet wheat, durum wheat, wild einkorn, NA"                                                                     
## [17] "coconut"                                                                                                                                                                                                                                               
## [18] "fennel, dill, angelica, cumin, wild carrot, parsnip, parsley"                                                                                                                                                                                          
## [19] "lettuce"                                                                                                                                                                                                                                               
## [20] "flaxseed"                                                                                                                                                                                                                                              
## [21] "tomato, NA"

AUC above 0.7 importances

#colnames(aha_importance)

importance_summary = aha_importance %>%
  filter(auc >0.7) %>%
  pivot_longer(cols = c("ATCCTATTATTTTATTATTTTACGAAACTAAACAAAGGTTCAGCAAGCGAGAATAATAAAAAAAG":"CATAAACTGGTAGCGACCGGCACCACCGGTAAACTGATCGCAGAAGCTACCGGCT"), names_to = "food", values_to = "importance") %>%
  group_by(label, food) %>%
  summarise(mean_importance = mean(importance)) %>%
  arrange(label, desc(mean_importance))
## `summarise()` has grouped output by 'label'. You can override using the
## `.groups` argument.
ranking = importance_summary %>%
  pull(food)

top10 = ranking[1:10]

plot_food_direction(top10, input, "treatment")

name_list = c()
for(item in top10){
 name = common_names %>%
    filter(str_detect(asv, item)) %>%
    pull(common_name)
 name_list = append(name_list, name)
}
name_list
##  [1] "cocoa bean"                                                                                                                   
##  [2] "avocado, NA, NA, NA, NA, NA"                                                                                                  
##  [3] "NA, ceylon cinnamon, bay leaf, avocado, NA, NA"                                                                               
##  [4] "garlic"                                                                                                                       
##  [5] "goji berry, NA, goji berry, cutleaf groundcherry, tomatillo, nightshade, NA, blackberry nightshade, potato, NA"               
##  [6] "muscadine grape, common grape vine, NA, NA"                                                                                   
##  [7] "common grape vine"                                                                                                            
##  [8] "common grape vine"                                                                                                            
##  [9] "sesame"                                                                                                                       
## [10] "NA, pecan"                                                                                                                    
## [11] "black chokeberry, apple, NA, crab apple, european pear, NA, afghan pear, nashi pear, siberian pear, hybrid chinese white pear"
importance_summary = pomms_importance %>%
  filter(auc >0.7) %>%
  pivot_longer(cols = c("ATCACGTTTTCCGAAAACAAACAAAGGTTCAGAAAGCGAAAATAAAAAAG":"ATCCTGTTTTCTCAAAACAAACAAAGGTTCAGAAAAAAAG"), names_to = "food", values_to = "importance") %>%
  group_by(label, food) %>%
  summarise(mean_importance = mean(importance)) %>%
  arrange(label, desc(mean_importance))
## `summarise()` has grouped output by 'label'. You can override using the
## `.groups` argument.
ranking = importance_summary %>%
  pull(food)

top10 = ranking[1:10]

plot_food_direction(top10, input, "treatment")

name_list = c()
for(item in top10){
 name = common_names %>%
    filter(str_detect(asv, item)) %>%
    pull(common_name)
 name_list = append(name_list, name)
}
name_list
##  [1] "dandelion"                                                                                                                                                                                                                                             
##  [2] "sunflower, jerusalem artichoke, sunchoke, dandelion"                                                                                                                                                                                                   
##  [3] "tea"                                                                                                                                                                                                                                                   
##  [4] "ashanti pepper, long pepper, black pepper, wild betel"                                                                                                                                                                                                 
##  [5] "canola, rapeseed"                                                                                                                                                                                                                                      
##  [6] "brown mustard, canola, rapeseed, black mustard, cabbage, broccoli, cauliflower, kale, Brussels sprouts, collard greens, savoy, kohlrabi, gai lan, mustards, bok choy, canola, rapeseed, field mustard, napa cabbage, turnip, rocket, NA, white mustard"
##  [7] "rye, NA, NA, NA, NA, NA, bread wheat, NA, NA, NA, emmer wheat, spelt, einkorn, wild einkorn, spelt, NA, domesticated hulled wheat, NA, rivet wheat, durum wheat, wild einkorn, NA"                                                                     
##  [8] "fennel, dill, angelica, cumin, wild carrot, parsnip, parsley"                                                                                                                                                                                          
##  [9] "goji berry, NA, goji berry, cutleaf groundcherry, tomatillo, nightshade, NA, blackberry nightshade, potato, NA"                                                                                                                                        
## [10] "coconut"                                                                                                                                                                                                                                               
## [11] "lettuce"                                                                                                                                                                                                                                               
## [12] "oat"